Tidyverse

Author

Arpita Anand Prayag

1 Tidyverse Challenge

Analyze the sales by location (state) with a bar plot. Since state and city are multiple features (variables), they should be split. Which state has the highes revenue? Replace your bike_orderlines_wrangled_tbl object with the newly wrangled object (with the columns state and city).

2 Running Code

2.0 Load libraries

library(tidyverse)
#> ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
#> ✔ dplyr     1.1.2     ✔ readr     2.1.4
#> ✔ forcats   1.0.0     ✔ stringr   1.5.0
#> ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
#> ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
#> ✔ purrr     1.0.1     
#> ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
#> ✖ dplyr::filter() masks stats::filter()
#> ✖ dplyr::lag()    masks stats::lag()
#> ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(readxl)
library(lubridate)
library(writexl)

2.1 Importing Files

bikes_tbl <- read_excel(path ="C:/Users/arpit/OneDrive/Documents/GitHub/repo/ss23-bdsb-arpitaprayag/assets/data/ds_data/ds_data/01_bike_sales/01_raw_data/bikes.xlsx")
orderlines_tbl <- read_excel(path =  "C:/Users/arpit/OneDrive/Documents/GitHub/repo/ss23-bdsb-arpitaprayag/assets/data/ds_data/ds_data/01_bike_sales/01_raw_data/orderlines.xlsx")
#> New names:
#> • `` -> `...1`
bikeshops_tbl  <- read_excel("C:/Users/arpit/OneDrive/Documents/GitHub/repo/ss23-bdsb-arpitaprayag/assets/data/ds_data/ds_data/01_bike_sales/01_raw_data/bikeshops.xlsx")

2.2 Examining Data

# Print it to the console
orderlines_tbl

2.3 Joining Data

left_join(orderlines_tbl, bikes_tbl, by = c("product.id" = "bike.id"))
bike_orderlines_joined_tbl <- orderlines_tbl %>%
  left_join(bikes_tbl, by = c("product.id" = "bike.id")) %>%
  left_join(bikeshops_tbl, by = c("customer.id" = "bikeshop.id"))

bike_orderlines_joined_tbl

2.4 Wrangling Data

bike_states_wrangled_tbl <- bike_orderlines_joined_tbl %>%
  # 5.1 Separate state and city name
  separate(location, into = c("city", "state"), sep = ", ") %>%
  
  # 5.2 Add the total price (price * quantity) 
  # Add a column to a tibble that uses a formula-style calculation of other columns
  mutate(total.price = price * quantity)

sales_by_states_tbl <- bike_states_wrangled_tbl %>%
  
  # Select columns and add a year
  select(order.date, total.price, state) %>%
  mutate(year = year(order.date)) %>%
  
  # Group by and summarize year and states
  group_by(year, state) %>%
  summarise(sales = sum(total.price)) %>%
  ungroup() %>%
  
  # Format $ Text
  mutate(sales_text = scales::dollar(sales, big.mark = ".", 
                                     decimal.mark = ",", 
                                     prefix = "", 
                                     suffix = " €"))
#> `summarise()` has grouped output by 'year'. You can override using the
#> `.groups` argument.
sales_by_states_tbl 

3 Visualize

sales_by_states_tbl %>%
  
  # Set up x, y, fill
  ggplot(aes(x = year, y = sales, fill = state)) +
  
  # to your plotting code to rotate your x-axis labels
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  
  # Geometries
  geom_col(fill = "#2DC6D6") + # Use geom_col for a bar plot
  geom_label(aes(label = sales)) + # Adding labels to the bars
  geom_smooth(method = "lm", se = FALSE) + # Adding a trendline
  
  # Facet
  facet_wrap(~ state) +
  
  # Formatting
  scale_y_continuous(labels = scales::dollar_format(big.mark = ".", 
                                                    decimal.mark = ",", 
                                                    prefix = "", 
                                                    suffix = " €")) +
  labs(
    title = "Revenue by state",
    subtitle = "North Rhine-Westphalia has highest revenue",
    fill = "States" # Changes the legend name
  )
#> `geom_smooth()` using formula = 'y ~ x'